#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'wshu'
__version__ = '1.0'
"""
    ***********************************
    *  @filename : get_domains.py
    *  @Author : wshu
    *  @CodeDate : 2020/4/16 22:04
    *  @Software : PyCharm
    ***********************************
    根据IP地址反查域名
"""

import re
import time
import traceback
from urllib.request import Request, urlopen
from urllib.parse import urlencode
import urllib
import json

import logging

logger = logging.getLogger('sechfm.core.hostscan')

_search_interval = 20

g_headers = ("User-Agent",
             "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7 (.NET CLR 3.5.30729)")

MAX_PAGE = 10


def getHTML(url):
    """Fetch *url* via HTTP GET and return the response body as text.

    Args:
        url: absolute URL to fetch.

    Returns:
        The response body decoded as a str.  The payload is assumed to be
        UTF-8; undecodable bytes are replaced instead of raising, since
        callers only run regexes over the markup.

    Raises:
        urllib.error.URLError / HTTPError on network failure (callers
        catch and log these).
    """
    req = Request(url)
    req.add_header(*g_headers)
    usock = urlopen(req)
    try:
        # BUG FIX: in Python 3 read() returns bytes, but every caller
        # applies str regexes to the result — decode before returning.
        return usock.read().decode('utf-8', errors='replace')
    finally:
        # Close the connection even if read() fails (was leaked before).
        usock.close()


def getHTMLByPost(url, data):
    """POST form *data* to *url* and return the response body as text.

    Args:
        url: absolute URL to POST to.
        data: mapping of form fields, passed through urlencode().

    Returns:
        The response body decoded as a str (UTF-8 assumed, errors
        replaced), matching getHTML()'s contract.

    Raises:
        urllib.error.URLError / HTTPError on network failure (callers
        catch and log these).
    """
    # BUG FIX: Request's data argument must be bytes in Python 3;
    # urlencode() returns str, so encode it.
    body = urlencode(data).encode('utf-8')
    req = Request(url, body)
    req.add_header(*g_headers)
    usock = urlopen(req)
    try:
        # Decode for the same reason as getHTML(): callers use str regexes.
        return usock.read().decode('utf-8', errors='replace')
    finally:
        usock.close()


class Domains:
    """Reverse-resolve an IP address to the domain names hosted on it.

    Each ``lookup_*`` method scrapes one public "reverse IP" web service
    and returns a list of domain names (possibly empty).  Network and
    parse failures are logged and reported as an empty list — lookups
    never raise to the caller.
    """

    def __init__(self, ip):
        # Dotted-quad IP address to look up, e.g. '1.2.3.4'.
        self.ip = ip

    def lookup_webhosting(self):
        """Query whois.webhosting.info, following pagination up to MAX_PAGE.

        Returns:
            List of domains, lower-cased with the trailing dot stripped.
        """
        target = 'http://whois.webhosting.info/' + self.ip
        try:
            html = getHTML(target)
        except Exception as e:
            logger.exception(e)
            return []

        # Locate the "Last" pagination link to learn the total page count.
        # re.escape: the dots in an IP are regex metacharacters otherwise.
        page = re.compile(
            r'''<a\s+href\s*=\s*['"]?/''' + re.escape(self.ip) +
            r'''\?pi=([\d]+)&ob=SLD&oo=ASC['"]?>(?:&nbsp;)*Last''', re.I)
        pageList = page.findall(html)
        pageLast = int(pageList[0]) if pageList else 1

        p = re.compile(r'''<td><a\s+href\s*=\s*['"]?\s*http://whois\.webhosting\.info\/([\d\w\.\-]+?)\s*['"]?>''', re.I)
        domainList = p.findall(html)

        # Cap the crawl so one IP can't keep us scraping forever.
        if pageLast > MAX_PAGE:
            pageLast = MAX_PAGE

        # Page 1 was fetched above; fetch pages 2..pageLast.
        for pageno in range(2, pageLast + 1):
            target = ('http://whois.webhosting.info/' + self.ip +
                      '?pi=' + str(pageno) + '&ob=SLD&oo=ASC')
            try:
                html = getHTML(target)
                domainList += p.findall(html)
            except Exception as e:
                logger.exception(e)
                # Back off before trying the next page.
                time.sleep(_search_interval)
                traceback.print_exc()
                continue

        # Results carry a trailing dot; normalise case as well.
        return [d.strip('.').lower() for d in domainList]

    def lookup_bing(self):
        """Query cn.bing.com with an ``ip:`` search, paging up to MAX_PAGE.

        Returns:
            List of domains scraped from the result headings.
        """
        target = 'http://cn.bing.com/search?q=ip%3A' + self.ip + '&go=&form=QBLH&filt=all&qs=n'
        try:
            html = getHTML(target)
        except Exception as e:
            # Was silently swallowed before; log like the other lookups.
            logger.exception(e)
            return []

        p = re.compile(r'''<h2><a\s+href\s*=\s*['"]?(?:http|https)://([\d\w\.\-]+)/?''', re.I)
        domainList = p.findall(html)
        if not domainList:
            return []

        # Hard-coded scrape of the result count ("N 条结果"); must be kept
        # in sync with bing.com's markup.
        page = re.compile(r'''\s?([\d,]+)\s+条结果''', re.I)
        pageStr = page.findall(html)
        if not pageStr:
            # Count marker not found (markup changed?) — settle for page 1
            # instead of crashing with IndexError.
            return domainList
        rowCount = int(pageStr[0].replace(',', ''))
        page_size = 10

        # BUG FIX: '/' is true division in Python 3, so range() below would
        # raise TypeError on a float; use integer ceiling division instead.
        pageTotal = -(-rowCount // page_size)
        if pageTotal > MAX_PAGE:
            pageTotal = MAX_PAGE

        # Page 1 was fetched above; fetch pages 2..pageTotal.
        for pageno in range(2, pageTotal + 1):
            first = pageno * 10 - 9  # Bing's 1-based offset of the page
            target = ('http://cn.bing.com/search?q=ip%3A' + self.ip +
                      '&filt=all&first=' + str(first) + '&FORM=PERE')
            try:
                html = getHTML(target)
                domainList += p.findall(html)
            except Exception as e:
                logger.exception(e)
                time.sleep(_search_interval)
                continue

        return domainList

    def lookup_aizhan(self):
        """Query dns.aizhan.com's JSON API (first page only).

        Returns:
            The 'domains' list from the JSON payload, or [] on any failure.
        """
        target = 'http://dns.aizhan.com/index.php?r=index/domains&ip=%s&page=1' % self.ip
        try:
            html = getHTML(target)
        except Exception as e:
            logger.exception(e)
            return []
        try:
            r_html = json.loads(html)
        except ValueError as e:
            logger.exception(e)
            return []
        # FIXME: no pagination yet, so the result set is incomplete.
        # FIXME: the rate-limit check was removed and should be restored.
        # .get() instead of [] so a schema change degrades to an empty
        # result rather than an uncaught KeyError.
        return r_html.get('domains', [])

    def lookup_domainbyip(self):
        """POST the IP to domainbyip.com and scrape the site list.

        Returns:
            List of domains, or [] on failure.
        """
        target = 'http://domainbyip.com/'
        data = {'ip': self.ip}

        try:
            html = getHTMLByPost(target, data)
        except Exception as e:
            # BUG FIX: was logging.exception — use the module logger so the
            # record carries this module's logger name like every sibling.
            logger.exception(e)
            return []

        p = re.compile(
            r'''<li\s+class\s*=\s*['"]?site[\w]+['"]?><a\s+href\s*=\s*['"]?(?:http|https)://([\d\w\.\-]+)[/'"]?''',
            re.I)
        return p.findall(html)

    def lookup_myipneighbors(self):
        """POST the IP to www.myipneighbors.com and scrape the result rows.

        NOTE(review): this foreign source sits on AWS and is unreliable
        from here, so it is currently not wired into main().

        Returns:
            List of domains, or [] on failure.
        """
        target = 'http://www.myipneighbors.com/search/check'
        data = {'search': self.ip, 'commit': 'search'}

        try:
            html = getHTMLByPost(target, data)
        except Exception as e:
            logger.exception(e)
            return []

        p = re.compile(
            r'''<tr\s+style\s*=\s*['"]?background:#(?:ffffff|dedede);['"]?><td><a\s+href=\s*['"]?\s*(?:http|https)://([\d\w\.\-]+)[/'"\s]?''',
            re.I)
        return p.findall(html)

    def lookup_chinaz(self):
        """POST the IP to s.tool.chinaz.com's same-IP tool and scrape it.

        Returns:
            List of domains, or [] on failure.
        """
        domainList = []
        target = 'http://s.tool.chinaz.com/Same/'
        data = {'s': self.ip}
        try:
            html = getHTMLByPost(target, data)
            p = re.compile(r'''</span>\s+<a\s+href='[\d\w\.\-/:]+'\s+target=_blank>([\d\w\.\-]+)</a>''', re.I)
            domainList = p.findall(html)
        except Exception as e:
            logger.exception(e)
            traceback.print_exc()

        return domainList


def _get_domain(func):
    """Run one ``lookup_*`` callable, trapping and logging any failure.

    Args:
        func: zero-argument callable returning a list of domain names.

    Returns:
        The list from ``func()``, or [] when it raised.
    """
    domain_ls = []

    try:
        domain_ls = func()
    except Exception as e:
        logger.exception(e)
        print('lookup error: %s' % e)
    # BUG FIX: func.func_name is Python 2 only (removed in Python 3,
    # AttributeError) — use func.__name__.  Also pass lazy %-args so the
    # message is only formatted when the log level is enabled.
    logger.info('domain query function: %s len: %d results : %s',
                func.__name__, len(domain_ls), domain_ls)
    return domain_ls


def main(ip):
    """Reverse-DNS entry point: collect domains for *ip* from all sources.

    Runs every enabled lookup backend in priority order (each one scrapes
    a public reverse-IP web service) and returns the de-duplicated union
    of their results.

    Args:
        ip: dotted-quad IP address string.

    Returns:
        List of unique domain names (unordered).
    """
    resolver = Domains(ip)
    backends = [
        resolver.lookup_aizhan,
        resolver.lookup_chinaz,
        resolver.lookup_bing,
        # lower-priority sources below
        # FIXME: site 'www.chaxp.com' is unavailable
        resolver.lookup_domainbyip,
        resolver.lookup_webhosting,
    ]

    collected = set()
    for backend in backends:
        collected.update(_get_domain(backend))

    return list(collected)